# Auto-reload edited local modules (utils_*) before each cell execution,
# so changes take effect without restarting the kernel
%load_ext autoreload
%autoreload 2
# Reload dependencies
# NOTE(review): reload_ext re-initializes the already-loaded extension;
# likely redundant with the load_ext above — confirm before removing
%reload_ext autoreload
This notebook is for comparing the performance of the base model vs. the fine-tuned model on the test dataset.
from utils_json import load_json_from_directory
import os
# Root directory containing one sub-directory per evaluation run
evals_directory_path_root = "./comparisons/"
# Collect the run sub-directories (skip any plain files under the root)
evals_directory_paths = []
for entry in os.listdir(evals_directory_path_root):
    candidate = os.path.join(evals_directory_path_root, entry)
    if os.path.isdir(candidate):
        evals_directory_paths.append(candidate)
# Load the evaluations, keyed by their directory path
evals = {}
for directory_path in evals_directory_paths:
    evals[directory_path] = load_json_from_directory(directory_path=directory_path, as_dict=True)
    print(f"Loaded evaluations from {directory_path}/")
Loaded evaluations from ./comparisons/240505_1826/ Loaded evaluations from ./comparisons/240505_1811/ Loaded evaluations from ./comparisons/240505_1842/
This dataframe summarizes the performance of the models by Run (Evaluation ID), by Model, by Language, and by Example ID.
from utils_evals import evals_comparisons_to_outputs_df, pd
# Transform evaluations to a data frame (one row per generated output;
# columns include eval_id, model_name, lang, example_id, f1_score, ...)
df_outputs = evals_comparisons_to_outputs_df(evals)
def format_model_name(model_name):
    """Map a raw model identifier string to a human-readable display name.

    Fine-tuned GPT-3.5 runs are distinguished by the "lg" marker in their
    identifier (large vs. small fine-tuning dataset); anything that is not
    fine-tuned and not GPT-4 Turbo is treated as the GPT-3.5 base model.
    """
    if "ft_gpt-3.5-turbo-0125" in model_name:
        if "lg" in model_name:
            return "GPT-3.5 Turbo Fine-Tuned (Large)"
        return "GPT-3.5 Turbo Fine-Tuned (Small)"
    if "gpt-4-turbo-2024-04-09" in model_name:
        return "GPT-4 Turbo (Base Model)"
    return "GPT-3.5 Turbo (Base Model)"
def format_language(lang):
    """Turn a raw language code such as "haitian-creole" into "Haitian Creole"."""
    spaced = str(lang).replace("-", " ")
    return spaced.title()
# Normalize language codes and model identifiers to display names
df_outputs['lang'] = df_outputs['lang'].apply(format_language)
df_outputs['model_name'] = df_outputs['model_name'].apply(format_model_name)
import json
from utils_metrics import assess_performance, is_structured_representation_adhering_to_schema
# Gold-standard structured representations, keyed by example_id (used below
# by compute_metrics to score each prediction)
ground_truth_examples = load_json_from_directory(directory_path="./../dataset/truth/json/", as_dict=True)
def compute_metrics(row):
    """Compute performance metrics for one prediction row.

    Args:
        row: DataFrame row with 'example_id' and
            'pred_structured_representation' (a double-encoded JSON string).

    Returns:
        pd.Series: [accuracy, precision, recall, f1_score].
    """
    ground_truth = ground_truth_examples[row['example_id']]
    try:
        # Double-decode JSON due to double-encoding in the data
        prediction = json.loads(json.loads(row['pred_structured_representation']))
    except (json.JSONDecodeError, TypeError):
        # A prediction that is not even valid JSON cannot adhere to the
        # schema, so it fails completely (same as the non-adherent case)
        return pd.Series([0, 0, 0, 0])
    # Compute performance metrics
    if is_structured_representation_adhering_to_schema(prediction):
        accuracy, precision, recall, f1_score = assess_performance(ground_truth, prediction)
    else:  # non-adherent schemas must fail completely
        accuracy = precision = recall = f1_score = 0
    # Return these metrics for addition to the DataFrame
    return pd.Series([accuracy, precision, recall, f1_score])
# Apply the function and add the new columns to the DataFrame
df_outputs[['accuracy', 'precision', 'recall', 'sanity_check_f1_score']] = df_outputs.apply(compute_metrics, axis=1)
# Check if the f1_score and sanity_check_f1_score are the same
df_outputs['passes_f1_score_sanity_check'] = df_outputs['f1_score'] == df_outputs['sanity_check_f1_score']
# Display rows where the sanity check fails
failed_sanity_check = df_outputs[~df_outputs['passes_f1_score_sanity_check']]
# Use len()/.empty for the row count: .count()[0] only counted non-null
# values in the first column (and positional [0] on a Series is deprecated)
if not failed_sanity_check.empty:
    print("Rows where sanity check fails:")
    print(failed_sanity_check)
    raise ValueError(f"{len(failed_sanity_check)} row(s) with inconsistent F1 scores")
else:
    print("Passed F1 score consistency sanity check")
Passed F1 score consistency sanity check
Given that the F1 score sanity checks passed, we can have increased confidence that the other metrics were accurately computed as well.
# Preview data frame (trailing expression renders it in the notebook)
df_outputs
| eval_id | model_name | lang | example_id | f1_score | input_text | input_markdown | pred_structured_representation | accuracy | precision | recall | sanity_check_f1_score | passes_f1_score_sanity_check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 240505_1826 | GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | arabic_020 | 0.727273 | استطلاع الاحتياجات الاجتماعية\nفي تحالف كامبري... | استطلاع الاحتياجات الاجتماعية\n===\n\nفي تحالف... | "{\"title\": \"CHA Social Needs Questionnaire\... | 0.769231 | 0.800000 | 0.666667 | 0.727273 | True |
| 1 | 240505_1826 | GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | arabic_037 | 0.761905 | استطلاع الاحتياجات الاجتماعية\nفي تحالف كامبري... | استطلاع الاحتياجات الاجتماعية\n===\n\nفي تحالف... | "{\"title\": \"CHA Social Needs Questionnaire\... | 0.807692 | 0.800000 | 0.727273 | 0.761905 | True |
| 2 | 240505_1826 | GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | arabic_018 | 0.833333 | استطلاع الاحتياجات الاجتماعية\nفي تحالف كامبري... | استطلاع الاحتياجات الاجتماعية\n===\n\nفي تحالف... | "{\"title\": \"CHA Social Needs Questionnaire\... | 0.846154 | 0.909091 | 0.769231 | 0.833333 | True |
| 3 | 240505_1826 | GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | arabic_025 | 0.750000 | استطلاع الاحتياجات الاجتماعية\nفي تحالف كامبري... | استطلاع الاحتياجات الاجتماعية\n===\n\nفي تحالف... | "{\"title\": \"CHA Social Needs Questionnaire\... | 0.769231 | 0.818182 | 0.692308 | 0.750000 | True |
| 4 | 240505_1826 | GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | arabic_043 | 0.857143 | استطلاع الاحتياجات الاجتماعية\nفي تحالف كامبري... | استطلاع الاحتياجات الاجتماعية\n===\n\nفي تحالف... | "{\"title\": \"CHA Social Needs Questionnaire\... | 0.846154 | 0.923077 | 0.800000 | 0.857143 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 955 | 240505_1842 | GPT-4 Turbo (Base Model) | Spanish | spanish_006 | 1.000000 | Cuestionario de necesidades sociales\nEn CHA s... | Cuestionario de necesidades sociales\n===\n\nE... | "{\n \"title\": \"CHA Social Needs Questionna... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | True |
| 956 | 240505_1842 | GPT-4 Turbo (Base Model) | Spanish | spanish_040 | 1.000000 | Cuestionario de necesidades sociales\nEn CHA s... | Cuestionario de necesidades sociales\n===\n\nE... | "{\n \"title\": \"CHA Social Needs Question... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | True |
| 957 | 240505_1842 | GPT-4 Turbo (Base Model) | Spanish | spanish_008 | 0.933333 | Cuestionario de necesidades sociales\nEn CHA s... | Cuestionario de necesidades sociales\n===\n\nE... | "{\n \"title\": \"CHA Social Needs Question... | 0.923077 | 1.000000 | 0.875000 | 0.933333 | True |
| 958 | 240505_1842 | GPT-4 Turbo (Base Model) | Spanish | spanish_018 | 1.000000 | Cuestionario de necesidades sociales\nEn CHA s... | Cuestionario de necesidades sociales\n===\n\nE... | "{\n \"title\": \"CHA Social Needs Question... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | True |
| 959 | 240505_1842 | GPT-4 Turbo (Base Model) | Spanish | spanish_041 | 0.960000 | Cuestionario de necesidades sociales\nEn CHA s... | Cuestionario de necesidades sociales\n===\n\nE... | "{\n \"title\": \"CHA Social Needs Question... | 0.961538 | 0.923077 | 1.000000 | 0.960000 | True |
960 rows × 13 columns
Check whether the data frames have the expected number of rows
OUTPUTS_ROWS_EXPECTED_COUNT = 960  # 4 LLMs x 80 Examples x 3 Runs
# len(df) is the true row count; .count()[0] only counted non-null values
# in the first column and would miscount if that column had NaNs
if len(df_outputs) != OUTPUTS_ROWS_EXPECTED_COUNT:
    raise ValueError(f"Expected {OUTPUTS_ROWS_EXPECTED_COUNT} rows but only found {len(df_outputs)} [df_outputs]")
else:
    print(f"Loaded {len(df_outputs)} of {OUTPUTS_ROWS_EXPECTED_COUNT} expected rows [df_outputs]")
Loaded 960 of 960 expected rows [df_outputs]
# General descriptive statistics for the f1_score column
# (count / mean / std / min / quartiles / max across all 960 rows)
print(df_outputs['f1_score'].describe())
count 960.000000 mean 0.923084 std 0.131545 min 0.200000 25% 0.900000 50% 1.000000 75% 1.000000 max 1.000000 Name: f1_score, dtype: float64
# Group by 'model_name' to see performance per model
model_performance = df_outputs.groupby('model_name')[['accuracy', 'precision', 'recall', 'f1_score']].describe()
# Trailing expression renders the table in the notebook
model_performance
| accuracy | precision | ... | recall | f1_score | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| model_name | |||||||||||||||||||||
| GPT-3.5 Turbo (Base Model) | 240.0 | 0.881891 | 0.138960 | 0.423077 | 0.846154 | 0.923077 | 1.0 | 1.0 | 240.0 | 0.879733 | ... | 1.0 | 1.0 | 240.0 | 0.861667 | 0.172308 | 0.200000 | 0.826572 | 0.916667 | 1.0 | 1.0 |
| GPT-3.5 Turbo Fine-Tuned (Large) | 240.0 | 0.952724 | 0.077580 | 0.692308 | 0.923077 | 1.000000 | 1.0 | 1.0 | 240.0 | 0.965800 | ... | 1.0 | 1.0 | 240.0 | 0.943410 | 0.097155 | 0.571429 | 0.928571 | 1.000000 | 1.0 | 1.0 |
| GPT-3.5 Turbo Fine-Tuned (Small) | 240.0 | 0.959936 | 0.073364 | 0.692308 | 0.923077 | 1.000000 | 1.0 | 1.0 | 240.0 | 0.968208 | ... | 1.0 | 1.0 | 240.0 | 0.954763 | 0.084365 | 0.600000 | 0.932143 | 1.000000 | 1.0 | 1.0 |
| GPT-4 Turbo (Base Model) | 240.0 | 0.939423 | 0.122757 | 0.423077 | 0.951923 | 1.000000 | 1.0 | 1.0 | 240.0 | 0.939729 | ... | 1.0 | 1.0 | 240.0 | 0.932498 | 0.133744 | 0.363636 | 0.937500 | 1.000000 | 1.0 | 1.0 |
4 rows × 32 columns
# Group by both 'model_name' and 'lang' to see performance per model and language
model_lang_performance = df_outputs.groupby(['model_name', 'lang'])[['accuracy', 'precision', 'recall', 'f1_score']].describe()
# Trailing expression renders the table in the notebook
model_lang_performance
| accuracy | precision | ... | recall | f1_score | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | ||
| model_name | lang | |||||||||||||||||||||
| GPT-3.5 Turbo (Base Model) | Arabic | 30.0 | 0.593590 | 0.113235 | 0.423077 | 0.509615 | 0.576923 | 0.653846 | 0.884615 | 30.0 | 0.589697 | ... | 0.553030 | 0.769231 | 30.0 | 0.513006 | 0.147965 | 0.250000 | 0.400000 | 0.521739 | 0.628148 | 0.800000 |
| Chinese | 30.0 | 0.979487 | 0.029849 | 0.884615 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.978249 | ... | 1.000000 | 1.000000 | 30.0 | 0.976302 | 0.038099 | 0.857143 | 0.958771 | 1.000000 | 1.000000 | 1.000000 | |
| English | 30.0 | 0.978205 | 0.043659 | 0.846154 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.991230 | ... | 1.000000 | 1.000000 | 30.0 | 0.981489 | 0.036755 | 0.866667 | 0.967742 | 1.000000 | 1.000000 | 1.000000 | |
| Haitian Creole | 30.0 | 0.896154 | 0.058155 | 0.807692 | 0.846154 | 0.923077 | 0.923077 | 1.000000 | 30.0 | 0.887249 | ... | 0.933333 | 1.000000 | 30.0 | 0.886246 | 0.069798 | 0.750000 | 0.827586 | 0.898990 | 0.933333 | 1.000000 | |
| Hindi | 30.0 | 0.937179 | 0.051979 | 0.846154 | 0.884615 | 0.923077 | 1.000000 | 1.000000 | 30.0 | 0.900806 | ... | 1.000000 | 1.000000 | 30.0 | 0.920133 | 0.083370 | 0.727273 | 0.880000 | 0.930952 | 1.000000 | 1.000000 | |
| Nepali | 30.0 | 0.807692 | 0.106894 | 0.615385 | 0.701923 | 0.826923 | 0.884615 | 1.000000 | 30.0 | 0.826748 | ... | 0.842949 | 1.000000 | 30.0 | 0.764198 | 0.156276 | 0.200000 | 0.666667 | 0.782609 | 0.878205 | 1.000000 | |
| Portuguese | 30.0 | 0.915385 | 0.057493 | 0.807692 | 0.884615 | 0.923077 | 0.923077 | 1.000000 | 30.0 | 0.912621 | ... | 0.933333 | 1.000000 | 30.0 | 0.912860 | 0.056722 | 0.827586 | 0.866667 | 0.909091 | 0.933333 | 1.000000 | |
| Spanish | 30.0 | 0.947436 | 0.050988 | 0.769231 | 0.923077 | 0.961538 | 1.000000 | 1.000000 | 30.0 | 0.951262 | ... | 1.000000 | 1.000000 | 30.0 | 0.939103 | 0.058504 | 0.812500 | 0.909091 | 0.956522 | 1.000000 | 1.000000 | |
| GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | 30.0 | 0.774359 | 0.050232 | 0.692308 | 0.730769 | 0.769231 | 0.807692 | 0.846154 | 30.0 | 0.816222 | ... | 0.731818 | 0.800000 | 30.0 | 0.726998 | 0.088564 | 0.571429 | 0.695652 | 0.734007 | 0.779762 | 0.857143 |
| Chinese | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 30.0 | 0.984615 | 0.031295 | 0.923077 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.993750 | ... | 1.000000 | 1.000000 | 30.0 | 0.983750 | 0.034138 | 0.900000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Haitian Creole | 30.0 | 0.992308 | 0.015648 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 0.988986 | 0.023018 | 0.933333 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Hindi | 30.0 | 0.987179 | 0.023326 | 0.923077 | 0.971154 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 0.985441 | 0.029951 | 0.888889 | 0.974138 | 1.000000 | 1.000000 | 1.000000 | |
| Nepali | 30.0 | 0.939744 | 0.050182 | 0.846154 | 0.884615 | 0.942308 | 1.000000 | 1.000000 | 30.0 | 0.966905 | ... | 1.000000 | 1.000000 | 30.0 | 0.923245 | 0.067590 | 0.800000 | 0.882222 | 0.926537 | 1.000000 | 1.000000 | |
| Portuguese | 30.0 | 0.961538 | 0.042853 | 0.884615 | 0.923077 | 0.980769 | 1.000000 | 1.000000 | 30.0 | 0.957857 | ... | 1.000000 | 1.000000 | 30.0 | 0.957557 | 0.049651 | 0.857143 | 0.928571 | 0.978261 | 1.000000 | 1.000000 | |
| Spanish | 30.0 | 0.982051 | 0.026209 | 0.923077 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.991667 | ... | 1.000000 | 1.000000 | 30.0 | 0.981306 | 0.028040 | 0.916667 | 0.965517 | 1.000000 | 1.000000 | 1.000000 | |
| GPT-3.5 Turbo Fine-Tuned (Small) | Arabic | 30.0 | 0.794872 | 0.063347 | 0.692308 | 0.730769 | 0.807692 | 0.846154 | 0.884615 | 30.0 | 0.853874 | ... | 0.775641 | 0.846154 | 30.0 | 0.758858 | 0.069731 | 0.600000 | 0.728788 | 0.769841 | 0.782609 | 0.880000 |
| Chinese | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 30.0 | 0.984615 | 0.031295 | 0.923077 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.987500 | ... | 1.000000 | 1.000000 | 30.0 | 0.987500 | 0.025427 | 0.937500 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Haitian Creole | 30.0 | 0.996154 | 0.011736 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 0.993333 | 0.020342 | 0.933333 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Hindi | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Nepali | 30.0 | 0.973077 | 0.035207 | 0.923077 | 0.923077 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.975641 | ... | 1.000000 | 1.000000 | 30.0 | 0.969759 | 0.038220 | 0.916667 | 0.923077 | 1.000000 | 1.000000 | 1.000000 | |
| Portuguese | 30.0 | 0.930769 | 0.042133 | 0.846154 | 0.923077 | 0.923077 | 0.923077 | 1.000000 | 30.0 | 0.928650 | ... | 0.933333 | 1.000000 | 30.0 | 0.928650 | 0.040437 | 0.866667 | 0.909091 | 0.919872 | 0.933333 | 1.000000 | |
| Spanish | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| GPT-4 Turbo (Base Model) | Arabic | 30.0 | 0.680769 | 0.131365 | 0.423077 | 0.615385 | 0.673077 | 0.759615 | 0.923077 | 30.0 | 0.692730 | ... | 0.685897 | 0.888889 | 30.0 | 0.650310 | 0.123824 | 0.363636 | 0.571429 | 0.637218 | 0.740741 | 0.888889 |
| Chinese | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 30.0 | 0.983333 | 0.026112 | 0.923077 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.984659 | ... | 1.000000 | 1.000000 | 30.0 | 0.984613 | 0.023289 | 0.937500 | 0.967742 | 1.000000 | 1.000000 | 1.000000 | |
| Haitian Creole | 30.0 | 0.983333 | 0.029764 | 0.923077 | 0.971154 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.989167 | ... | 1.000000 | 1.000000 | 30.0 | 0.985687 | 0.025368 | 0.933333 | 0.977273 | 1.000000 | 1.000000 | 1.000000 | |
| Hindi | 30.0 | 0.993590 | 0.017736 | 0.923077 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.997778 | ... | 1.000000 | 1.000000 | 30.0 | 0.986667 | 0.035306 | 0.888889 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Nepali | 30.0 | 0.896154 | 0.128618 | 0.538462 | 0.807692 | 0.942308 | 1.000000 | 1.000000 | 30.0 | 0.874523 | ... | 1.000000 | 1.000000 | 30.0 | 0.876442 | 0.149422 | 0.500000 | 0.782609 | 0.939799 | 1.000000 | 1.000000 | |
| Portuguese | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 1.000000 | ... | 1.000000 | 1.000000 | 30.0 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Spanish | 30.0 | 0.978205 | 0.034524 | 0.884615 | 0.961538 | 1.000000 | 1.000000 | 1.000000 | 30.0 | 0.978974 | ... | 1.000000 | 1.000000 | 30.0 | 0.976266 | 0.038956 | 0.888889 | 0.960000 | 1.000000 | 1.000000 | 1.000000 | |
32 rows × 32 columns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def plot_overall_performance(df):
    """
    Plot mean performance metrics per model as grouped pastel bars.

    Args:
        df (DataFrame): DataFrame containing 'model_name', 'accuracy', 'precision', 'recall', 'f1_score'.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    # Mean of each metric for every model
    per_model_means = df.copy().groupby('model_name')[metrics].mean()

    fig, ax = plt.subplots(figsize=(14, 8))
    # One group of four bars per model along the x-axis
    group_positions = np.arange(len(per_model_means))
    width = 0.2
    # Pastel colors, one per metric
    pastel_palette = ['#ffb3ba', '#ffdfba', '#ffffba', '#baffc9']

    for offset, (metric, shade) in enumerate(zip(metrics, pastel_palette)):
        ax.bar(
            group_positions + offset * width,
            per_model_means[metric],
            width=width,
            label=metric.replace('_', ' ').title(),
            color=shade,
            edgecolor='black',
        )

    # Center the tick labels under each group of bars
    ax.set_xticks(group_positions + width * (len(metrics) - 1) / 2)
    ax.set_xticklabels(per_model_means.index)
    ax.set_xlabel('Model')
    ax.set_ylabel('Metric Scores')
    ax.set_title('Overall Model Performance by Metric')
    # Finer y-axis increments for easier reading
    ax.set_yticks(np.arange(0, 1.01, 0.1))
    # Keep the legend outside the plotting area
    ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
# Render the overall per-model metric comparison
plot_overall_performance(df_outputs)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib as mpl
def plot_model_lang_performance_grid_2x2(df):
    """
    Plot a 2x2 grid of bar charts, one subplot per metric (accuracy,
    precision, recall, f1_score), with one bar per (model, language) mean.
    Within each subplot the rows are sorted ascending by that metric, and a
    single figure-level legend maps colors to languages.

    Args:
        df (DataFrame): must contain 'model_name', 'lang' and the four
            metric columns.
    """
    df = df.copy()
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    # Mean of each metric per (model, language) pair
    model_lang_metrics = df.groupby(['model_name', 'lang'])[metrics].mean()
    # Setup dynamic colors for languages (evenly spaced samples of tab20)
    unique_langs = df['lang'].unique()
    color_map = mpl.colormaps['tab20']
    colors = color_map(np.linspace(0, 1, len(unique_langs)))
    lang_colors = dict(zip(unique_langs, colors))
    # Create 2x2 grid of subplots with appropriate size
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
    for ax, metric in zip(axes.flatten(), metrics):
        # Sort ALL (model, lang) rows by this metric; the model order on the
        # x-axis then follows each model's first (i.e. worst) appearance
        sorted_data = model_lang_metrics.sort_values(by=[metric], ascending=True)
        models = sorted_data.index.get_level_values(0).unique()
        num_models = len(models)
        num_languages = len(unique_langs)
        bar_width = 0.8 / num_languages
        for j, model in enumerate(models):
            # Rows for this model only, still sorted by the metric, so the
            # language order inside a group can differ between models
            data = sorted_data.xs(model, level=0)
            # NOTE(review): model_pos does not depend on j, so recomputing it
            # inside this loop is redundant (but harmless)
            model_pos = np.arange(num_models) * (1 + 0.1)
            for k, (lang, row) in enumerate(data.iterrows()):
                bar_pos = model_pos[j] + k * bar_width
                # Label only the very first bar; the real legend is built
                # manually from proxy patches after the loops
                rect = ax.bar(bar_pos, row[metric], width=bar_width, color=lang_colors[lang],
                              label=f"{lang} ({metric})" if (j == 0 and k == 0) else "")
        ax.set_title(f'{metric.title().replace("_", " ")}')
        # Relies on model_pos surviving from the loop above (same value every
        # iteration); centers each tick under its group of bars
        ax.set_xticks(model_pos + bar_width * (num_languages - 1) / 2)
        ax.set_xticklabels(models, rotation=45, ha="right")
        ax.set_ylabel(metric.title().replace("_", " "))
    # Manage legends from all subplots: one figure-level legend built from
    # proxy rectangles, one per language
    handles, labels = [], []
    for lang, color in lang_colors.items():
        handles.append(plt.Rectangle((0, 0), 1, 1, color=color))
        labels.append(lang)
    fig.legend(handles, labels, title="Language", loc='upper left', bbox_to_anchor=(1.01, 0.97))
    plt.tight_layout()
    plt.show()
# Render the 2x2 per-metric grid broken down by model and language
plot_model_lang_performance_grid_2x2(df_outputs)
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
def plot_vertical_performance_by_language(df):
    """
    Draw one vertical box plot of f1_score per (model, language) pair,
    grouped along the x-axis by language with a gap between language groups.

    Args:
        df (DataFrame): must contain 'model_name', 'lang' and 'f1_score'.
    """
    df = df.copy()
    # One color per language, sampled evenly from the tab20 colormap
    unique_languages = df['lang'].unique()
    color_map = mpl.colormaps['tab20']
    colors = color_map(np.linspace(0, 1, len(unique_languages)))
    language_color_dict = dict(zip(unique_languages, colors))
    df['color'] = df['lang'].map(language_color_dict)
    # Composite key used both for grouping rows and as the tick label
    df['label'] = df['model_name'] + ' / ' + df['lang']
    fig, ax = plt.subplots(figsize=(14, 12))
    # Shared styling applied to every box
    whiskerprops = dict(linestyle='--', linewidth=2, color='grey')
    capprops = dict(linewidth=2, color='black')
    medianprops = dict(linestyle='-', linewidth=2.5, color='firebrick')
    flierprops = dict(marker='o', color='black', markersize=5)
    # Sort by language first, then model, so languages form contiguous groups
    sorted_labels = sorted(df['label'].unique(), key=lambda x: (x.split(' / ')[1], x.split(' / ')[0]))
    # Assign an integer x-position to each label, inserting a 2-slot gap
    # whenever the language changes
    positions = {}
    current_position = 0
    last_language = None
    for label in sorted_labels:
        parts = label.split(' / ')
        if parts[1] != last_language:
            if last_language is not None:  # Increment to create a gap between different languages
                current_position += 2
            last_language = parts[1]
        positions[label] = current_position
        current_position += 1
    legend_handles = {}
    for label in sorted_labels:
        data = df[df['label'] == label]['f1_score']
        if not data.empty:
            color = df[df['label'] == label]['color'].iloc[0]
            box = ax.boxplot(data, positions=[positions[label]],
                             patch_artist=True, boxprops=dict(facecolor=color, color=color),
                             whiskerprops=whiskerprops, capprops=capprops,
                             medianprops=medianprops, flierprops=flierprops, widths=0.3)
            # Keyed by language, so later entries of the same language simply
            # overwrite with the same color — one legend patch per language
            key_label = label.split(' / ')[1]
            legend_handles[key_label] = mpl.patches.Patch(color=color, label=key_label)
    ax.set_title('Model Performance by Language')
    ax.set_xlabel('Model / Language')
    ax.set_ylabel('F1 Score')
    ax.set_xticks(list(positions.values()))
    ax.set_xticklabels(sorted_labels, rotation=45, ha='right')
    plt.legend(handles=legend_handles.values(), title="Languages", loc='upper right', bbox_to_anchor=(1.2, 1))
    plt.tight_layout()
    plt.show()
# Visualize F1 distributions as vertical box plots grouped by language
plot_vertical_performance_by_language(df_outputs)
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
def plot_horizontal_performance_by_language(df):
    """
    Horizontal variant of plot_vertical_performance_by_language: one
    horizontal box plot of f1_score per (model, language) pair, grouped
    along the y-axis by language with a gap between language groups.

    Args:
        df (DataFrame): must contain 'model_name', 'lang' and 'f1_score'.
    """
    df = df.copy()
    # One color per language, sampled evenly from the tab20 colormap
    unique_languages = df['lang'].unique()
    color_map = mpl.colormaps['tab20']
    colors = color_map(np.linspace(0, 1, len(unique_languages)))
    language_color_dict = dict(zip(unique_languages, colors))
    df['color'] = df['lang'].map(language_color_dict)
    # Composite key used both for grouping rows and as the tick label
    df['label'] = df['model_name'] + ' / ' + df['lang']
    fig, ax = plt.subplots(figsize=(14, 12))
    # Shared styling applied to every box
    whiskerprops = dict(linestyle='--', linewidth=2, color='grey')
    capprops = dict(linewidth=2, color='black')
    medianprops = dict(linestyle='-', linewidth=2.5, color='firebrick')
    flierprops = dict(marker='o', color='black', markersize=5)
    # Sort by language first, then model, so languages form contiguous groups
    sorted_labels = sorted(df['label'].unique(), key=lambda x: (x.split(' / ')[1], x.split(' / ')[0]))
    # Assign an integer y-position to each label, inserting a 2-slot gap
    # whenever the language changes
    positions = {}
    current_position = 0
    last_language = None
    for label in sorted_labels:
        parts = label.split(' / ')
        if parts[1] != last_language:
            if last_language is not None:
                current_position += 2  # Adding a larger gap for visual separation
            last_language = parts[1]
        positions[label] = current_position
        current_position += 1
    legend_handles = {}
    for label in sorted_labels:
        data = df[df['label'] == label]['f1_score']
        if not data.empty:
            color = df[df['label'] == label]['color'].iloc[0]
            box = ax.boxplot(data, positions=[positions[label]], vert=False,
                             patch_artist=True, boxprops=dict(facecolor=color, color=color),
                             whiskerprops=whiskerprops, capprops=capprops,
                             medianprops=medianprops, flierprops=flierprops, widths=0.6)
            # Keyed by language, so later entries of the same language simply
            # overwrite with the same color — one legend patch per language
            key_label = label.split(' / ')[1]
            legend_handles[key_label] = mpl.patches.Patch(color=color, label=key_label)
    ax.set_title('Model Performance by Language')
    ax.set_ylabel('Model / Language')
    ax.set_xlabel('F1 Score')
    ax.set_yticks(list(positions.values()))
    ax.set_yticklabels(sorted_labels, rotation=0, ha='right')
    plt.legend(handles=legend_handles.values(), title="Languages", loc='upper right', bbox_to_anchor=(1.2, 1))
    plt.tight_layout()
    plt.show()
# Visualize F1 distributions as horizontal box plots grouped by language
plot_horizontal_performance_by_language(df_outputs)
import pandas as pd
def create_performance_table(df):
    """
    Build a performance table with one row per (model, language) plus an
    'Overall' summary row per model, without altering the input DataFrame.

    Args:
        df (DataFrame): DataFrame containing model performance data.

    Returns:
        DataFrame: Mean metrics indexed by (model_name, lang), with the
        'Overall' row sorted last within each model.
    """
    working = df.copy()
    metric_columns = ['accuracy', 'precision', 'recall', 'f1_score']

    # Per-language means within each model
    by_language = working.groupby(['model_name', 'lang'])[metric_columns].mean().reset_index()

    # Whole-model means, tagged with the pseudo-language 'Overall'
    by_model = working.groupby('model_name')[metric_columns].mean().reset_index()
    by_model['lang'] = 'Overall'

    combined = pd.concat([by_language, by_model], ignore_index=True)

    # Order languages alphabetically with 'Overall' forced to the end
    language_order = sorted(set(combined['lang']) - {'Overall'}) + ['Overall']
    combined['lang'] = pd.Categorical(combined['lang'],
                                      categories=language_order,
                                      ordered=True)
    combined.sort_values(by=['model_name', 'lang'], inplace=True)

    # Multi-level index for better organization
    combined.set_index(['model_name', 'lang'], inplace=True)
    return combined
# Build the summary table; copy it to the clipboard in an Excel-friendly
# format for pasting into a spreadsheet, then render it in the notebook
performance_table = create_performance_table(df_outputs)
performance_table.to_clipboard(excel=True)
performance_table
| accuracy | precision | recall | f1_score | ||
|---|---|---|---|---|---|
| model_name | lang | ||||
| GPT-3.5 Turbo (Base Model) | Arabic | 0.593590 | 0.589697 | 0.462213 | 0.513006 |
| Chinese | 0.979487 | 0.978249 | 0.975370 | 0.976302 | |
| English | 0.978205 | 0.991230 | 0.972500 | 0.981489 | |
| Haitian Creole | 0.896154 | 0.887249 | 0.886499 | 0.886246 | |
| Hindi | 0.937179 | 0.900806 | 0.942222 | 0.920133 | |
| Nepali | 0.807692 | 0.826748 | 0.739724 | 0.764198 | |
| Portuguese | 0.915385 | 0.912621 | 0.914071 | 0.912860 | |
| Spanish | 0.947436 | 0.951262 | 0.929646 | 0.939103 | |
| Overall | 0.881891 | 0.879733 | 0.852781 | 0.861667 | |
| GPT-3.5 Turbo Fine-Tuned (Large) | Arabic | 0.774359 | 0.816222 | 0.659843 | 0.726998 |
| Chinese | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 0.984615 | 0.993750 | 0.975568 | 0.983750 | |
| Haitian Creole | 0.992308 | 1.000000 | 0.979167 | 0.988986 | |
| Hindi | 0.987179 | 1.000000 | 0.972857 | 0.985441 | |
| Nepali | 0.939744 | 0.966905 | 0.890647 | 0.923245 | |
| Portuguese | 0.961538 | 0.957857 | 0.958009 | 0.957557 | |
| Spanish | 0.982051 | 0.991667 | 0.971742 | 0.981306 | |
| Overall | 0.952724 | 0.965800 | 0.925979 | 0.943410 | |
| GPT-3.5 Turbo Fine-Tuned (Small) | Arabic | 0.794872 | 0.853874 | 0.688175 | 0.758858 |
| Chinese | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 0.984615 | 0.987500 | 0.987500 | 0.987500 | |
| Haitian Creole | 0.996154 | 1.000000 | 0.987500 | 0.993333 | |
| Hindi | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Nepali | 0.973077 | 0.975641 | 0.964530 | 0.969759 | |
| Portuguese | 0.930769 | 0.928650 | 0.928650 | 0.928650 | |
| Spanish | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Overall | 0.959936 | 0.968208 | 0.944544 | 0.954763 | |
| GPT-4 Turbo (Base Model) | Arabic | 0.680769 | 0.692730 | 0.619111 | 0.650310 |
| Chinese | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| English | 0.983333 | 0.984659 | 0.985278 | 0.984613 | |
| Haitian Creole | 0.983333 | 0.989167 | 0.982444 | 0.985687 | |
| Hindi | 0.993590 | 0.997778 | 0.977778 | 0.986667 | |
| Nepali | 0.896154 | 0.874523 | 0.879468 | 0.876442 | |
| Portuguese | 1.000000 | 1.000000 | 1.000000 | 1.000000 | |
| Spanish | 0.978205 | 0.978974 | 0.974537 | 0.976266 | |
| Overall | 0.939423 | 0.939729 | 0.927327 | 0.932498 |
from utils_consistency import create_example_id_consistency_table
# Mean/std of each metric per (lang, example_id, model_name) across the
# repeated runs (count column shows how many runs contributed)
consistency_table = create_example_id_consistency_table(df_outputs)
consistency_table
| accuracy_mean | precision_mean | recall_mean | f1_score_mean | accuracy_std | precision_std | recall_std | f1_score_std | count | |||
|---|---|---|---|---|---|---|---|---|---|---|---|
| lang | example_id | model_name | |||||||||
| Arabic | arabic_006 | GPT-3.5 Turbo (Base Model) | 0.807692 | 0.833333 | 0.555556 | 0.666667 | 0.076923 | 0.166667 | 0.111111 | 0.133333 | 3.0 |
| GPT-3.5 Turbo Fine-Tuned (Large) | 0.769231 | 0.800000 | 0.444444 | 0.571429 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | ||
| GPT-3.5 Turbo Fine-Tuned (Small) | 0.846154 | 0.777778 | 0.777778 | 0.777778 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | ||
| GPT-4 Turbo (Base Model) | 0.756410 | 0.644444 | 0.666667 | 0.654971 | 0.022206 | 0.038490 | 0.000000 | 0.020258 | 3.0 | ||
| arabic_014 | GPT-3.5 Turbo (Base Model) | 0.500000 | 0.500000 | 0.307692 | 0.380952 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Spanish | spanish_041 | GPT-4 Turbo (Base Model) | 0.961538 | 0.923077 | 1.000000 | 0.960000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 |
| spanish_049 | GPT-3.5 Turbo (Base Model) | 0.923077 | 0.909091 | 0.909091 | 0.909091 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | |
| GPT-3.5 Turbo Fine-Tuned (Large) | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | ||
| GPT-3.5 Turbo Fine-Tuned (Small) | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | ||
| GPT-4 Turbo (Base Model) | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 |
320 rows × 9 columns
from utils_consistency import plot_consistency_heatmaps
# Visualize run-to-run consistency from the table above as heatmaps
plot_consistency_heatmaps(consistency_table)
from utils_consistency import subplots_consistency_heatmaps
# Same data as subplots; height_multiplier scales the figure height
subplots_consistency_heatmaps(consistency_table, height_multiplier=7)
from utils_print_examples import print_challenging_examples_for_each_language
# Print, per language/model, the examples scoring below the F1 cutoff,
# with a diff of gold-standard vs. predicted choice selections.
# NOTE(review): `ground_truth_examples` is not defined in any visible cell
# above — presumably loaded in an earlier (unseen) cell; confirm it is in
# scope before this cell runs.
print_challenging_examples_for_each_language(df_outputs, ground_truths=ground_truth_examples, f1_score_cutoff=0.5)
Language: Arabic, Model: GPT-3.5 Turbo (Base Model), Example ID: arabic_035 Performance Metrics - Accuracy: 0.5384615384615384, Precision: 0.2857142857142857, Recall: 0.2222222222222222, F1 Score: 0.25 Key Differences: Question: 'In the last 12 months, did you worry your food would run out before you got money to buy more?' | Choice 'Often' (ID: 3) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Never' (ID: 4) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Sometimes' (ID: 5) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Often' (ID: 6) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'No' (ID: 7) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Yes' (ID: 8) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you or your family had trouble getting transportation to medical appointments?' | Choice 'Yes' (ID: 11) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I have a place to live today, but I am at risk of losing my housing.' (ID: 13) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I am living outside, in a car, abandoned building, or bus/train station.' (ID: 15) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Do you have access to the internet at home? 
(Select all that apply)' | Choice 'Yes, on my phone' (ID: 17) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'No' (ID: 18) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'No' (ID: 19) - Gold Standard: SELECTED, Prediction: UNSELECTED ================================================== Language: Arabic, Model: GPT-3.5 Turbo (Base Model), Example ID: arabic_035 Performance Metrics - Accuracy: 0.5769230769230769, Precision: 0.3333333333333333, Recall: 0.2222222222222222, F1 Score: 0.26666666666666666 Key Differences: Question: 'In the last 12 months, did you worry your food would run out before you got money to buy more?' | Choice 'Sometimes' (ID: 2) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, did you worry your food would run out before you got money to buy more?' | Choice 'Often' (ID: 3) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Never' (ID: 4) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Sometimes' (ID: 5) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Often' (ID: 6) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you or your family had trouble getting transportation to medical appointments?' | Choice 'Yes' (ID: 11) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I have a place to live today, but I am at risk of losing my housing.' 
(ID: 13) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I am living outside, in a car, abandoned building, or bus/train station.' (ID: 15) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'Yes, on my phone' (ID: 17) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'No' (ID: 18) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'No' (ID: 19) - Gold Standard: SELECTED, Prediction: UNSELECTED ================================================== Language: Arabic, Model: GPT-3.5 Turbo (Base Model), Example ID: arabic_020 Performance Metrics - Accuracy: 0.5, Precision: 0.42857142857142855, Recall: 0.25, F1 Score: 0.3157894736842105 Key Differences: Question: 'In the last 12 months, did you worry your food would run out before you got money to buy more?' | Choice 'Often' (ID: 3) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Often' (ID: 6) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Yes' (ID: 8) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Already shut off' (ID: 9) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I have a steady place to live.' (ID: 12) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'What is your living situation today?' 
| Choice 'I am temporarily staying with others, in a hotel, or a shelter.' (ID: 14) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'No' (ID: 18) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'No' (ID: 19) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Yes' (ID: 20) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Prefer not to say' (ID: 21) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' | Choice 'Yes' (ID: 22) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' | Choice 'No' (ID: 23) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Can we refer you to free or low-cost community programs (like food pantries) by sharing your name, phone, and address so they can reach you?' | Choice 'Yes' (ID: 24) - Gold Standard: SELECTED, Prediction: UNSELECTED ================================================== Language: Arabic, Model: GPT-3.5 Turbo (Base Model), Example ID: arabic_020 Performance Metrics - Accuracy: 0.5384615384615384, Precision: 0.5, Recall: 0.25, F1 Score: 0.3333333333333333 Key Differences: Question: 'In the last 12 months, did you worry your food would run out before you got money to buy more?' 
| Choice 'Often' (ID: 3) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Often' (ID: 6) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Yes' (ID: 8) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Already shut off' (ID: 9) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I have a steady place to live.' (ID: 12) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'What is your living situation today?' | Choice 'I am temporarily staying with others, in a hotel, or a shelter.' (ID: 14) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'No' (ID: 18) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'No' (ID: 19) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Yes' (ID: 20) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Prefer not to say' (ID: 21) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' 
| Choice 'Yes' (ID: 22) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Can we refer you to free or low-cost community programs (like food pantries) by sharing your name, phone, and address so they can reach you?' | Choice 'Yes' (ID: 24) - Gold Standard: SELECTED, Prediction: UNSELECTED ================================================== Language: Arabic, Model: GPT-4 Turbo (Base Model), Example ID: arabic_025 Performance Metrics - Accuracy: 0.46153846153846156, Precision: 0.4444444444444444, Recall: 0.3076923076923077, F1 Score: 0.3636363636363637 Key Differences: Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Sometimes' (ID: 5) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Often' (ID: 6) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'No' (ID: 7) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Yes' (ID: 8) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Already shut off' (ID: 9) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you or your family had trouble getting transportation to medical appointments?' | Choice 'No' (ID: 10) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'No' (ID: 18) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? 
[Ambulatory Only]' | Choice 'No' (ID: 19) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Yes' (ID: 20) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, have you experienced violence at home or in your relationships? [Ambulatory Only]' | Choice 'Prefer not to say' (ID: 21) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' | Choice 'Yes' (ID: 22) - Gold Standard: UNSELECTED, Prediction: SELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' | Choice 'No' (ID: 23) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Can we refer you to free or low-cost community programs (like food pantries) by sharing your name, phone, and address so they can reach you?' | Choice 'No' (ID: 25) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Declined the questionnaire?' | Choice 'I do not want to answer these questions.' (ID: 26) - Gold Standard: SELECTED, Prediction: UNSELECTED ================================================== Language: Nepali, Model: GPT-3.5 Turbo (Base Model), Example ID: nepali_043 Performance Metrics - Accuracy: 0.6923076923076923, Precision: 1.0, Recall: 0.1111111111111111, F1 Score: 0.19999999999999998 Key Differences: Question: 'In the last 12 months, the food we bought didn't last and we didn't have money to get more.' | Choice 'Sometimes' (ID: 5) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' | Choice 'Yes' (ID: 8) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'In the last 12 months, has the electric, gas or oil company threatened to shut off services in your home?' 
| Choice 'Already shut off' (ID: 9) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I have a place to live today, but I am at risk of losing my housing.' (ID: 13) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'What is your living situation today?' | Choice 'I am living outside, in a car, abandoned building, or bus/train station.' (ID: 15) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Do you have access to the internet at home? (Select all that apply)' | Choice 'Yes, on my phone' (ID: 17) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Would you like a CHA care team member to reach out to help you with the needs you checked off above?' | Choice 'No' (ID: 23) - Gold Standard: SELECTED, Prediction: UNSELECTED Question: 'Can we refer you to free or low-cost community programs (like food pantries) by sharing your name, phone, and address so they can reach you?' | Choice 'Yes' (ID: 24) - Gold Standard: SELECTED, Prediction: UNSELECTED ==================================================